From Evan: We have 3 files designated HIGH/MODERATE/LOW that are lists of numbers. These numbers correspond to the SNP numbers (so 1 corresponds to the first row of data in the counts file). The numbers in the HIGH file were categorized as high impact, so stuff like premature stop codons and stuff. Details are in the snpEff manual. There's more information that I can pull for each SNP, like what exactly the effect is and if it's in UTR/coding etc...
Here are how many we have in each category:
table(effect)
## effect
## none low moderate high
## 4951495 30251 17494 760
We will look at, by SNP category:
First let's check that everything looks reasonable and pick some coverage cutoffs. Here are total coverage histograms by type:
all_hist <- hist(pos$totDepth[pos$totDepth<=1000], plot=FALSE, breaks=50)
depth_hists <- tapply(pmin(1000,pos$totDepth), effect, hist, breaks=all_hist$breaks, plot=FALSE)
plot(0, type='n', xlim=range(all_hist$breaks), ylim=range(lapply(depth_hists, "[[", "density")),
xlab="total coverage", ylab="density")
for (k in seq_along(depth_hists)) {
lines(depth_hists[[k]]$mids, depth_hists[[k]]$density, col=effect_cols[k])
}
legend("topright", legend=levels(effect), lty=1, col=effect_cols)
abline(v=c(min_coverage, max_coverage), lty=3)
We'll restrict to sites with total coverage between 150 and 500 (the vertical lines in the previous plot). Within these bounds, do the different catgories have statistically significantly different mean total coverages?
good_snps <- with(pos, totDepth > min_coverage & totDepth < max_coverage)
summary(lm(pos$totDepth ~ effect, subset=good_snps))
## Warning: closing unused connection 5
## (272torts_snp1e6_minmapq20minq30_map2kenro10K.first5milSNPs.counts.bin.gz)
##
## Call:
## lm(formula = pos$totDepth ~ effect, subset = good_snps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -171.064 -48.064 7.936 53.936 199.294
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 322.06355 0.03459 9309.649 <2e-16 ***
## effectlow -22.35732 0.43449 -51.456 <2e-16 ***
## effectmoderate -13.18378 0.57041 -23.113 <2e-16 ***
## effecthigh -5.99337 2.87054 -2.088 0.0368 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 71.87 on 4360444 degrees of freedom
## Multiple R-squared: 0.0007275, Adjusted R-squared: 0.0007268
## F-statistic: 1058 on 3 and 4360444 DF, p-value: < 2.2e-16
Yes - SNPs with an effect have lower coverage, but the difference decreases with SNP effect. This looks to be due to a bimodality in total coverage, with decreasing contributions of the lower mode to larger effect SNPs.
To-do: get actual scaffold lengths.
Here are mean coverages by scaffold:
hist(scaffolds$mean_coverage, breaks=50, main='mean coverage by scaffold')
plot(scaffolds$length, scaffolds$nsnps, pch=20, xlab='scaffold length (bp)', ylab='# of snps')
layout(t(1:2))
plot(scaffolds$length, scaffolds$mean_coverage, pch=20, xlab='scaffold length (bp)',
ylab='mean coverage')
plot(scaffolds$nsnps, scaffolds$mean_coverage, pch=20, xlab='# of snps by scaffold',
ylab='mean coverage')
This range is much higher than expected due to chance.
And, here is a plot of mean coverage along the genome:
alg_win( pos$totDepth[good_snps], window=1e5, subset=good_snps, ylab='mean coverage')
To get coverages by individual and by scaffold, we've used
../count-utils/get-coverage-by-scaffold.R 272torts_snp1e6_minmapq20minq30_map2kenro10K.counts.bin.gz 150 500 272torts_snp1e6_minmapq20minq30_map2kenro10K.coverage_by_scaffold.gz
Let \(C_{xi}\) be the total number of genotyped alleles of individual \(i\) mapping to scaffold \(x\), which is what is computed by get-coverage-by-scaffold.R above. First, we normalize this, dividing by individual mean coverage: \(R_{xi} = C_{xi} / \mu_i\), where \(\mu_i\) is the total coverage of individual \(i\). There is significant sharing of patterns of coverage between scaffolds: we see this by computing the covariance matrix between scaffolds of relative per-individual coverages, and doing PCA on this covariance matrix.
rel_scaf_coverage <- sweep(scaf_coverage, 2, indiv_mean_coverage, "/")
# this is the SAME THING as scaffolds$mean_coverage
# scaffolds$relative_coverage <- rowMeans(scaf_coverage)
coverage_cov <- cov(t(rel_scaf_coverage))
scaf_pcs <- eigen(coverage_cov)
pairs(scaf_pcs$vectors[,1:4], main="PCs of coverage by scaffold")
head(scaf_pcs$values)
## [1] 5.51964640 0.66130709 0.49499394 0.14849332 0.13136233 0.09366259
scaffolds$scafPC1 <- scaf_pcs$vectors[,1]
layout(t(1:2))
plot(scaffolds$scafPC1, scaffolds$mean_coverage, xlab='scaffold PC1', ylab='mean coverage')
plot(scaffolds$scafPC1, scaffolds$mean_coverage_noQC, xlab='scaffold PC1', ylab='mean coverage without QC', ylim=c(0,1000))
The first PC explains 0.9740414 of the variance, and is highly correlated with mean coverage at SNPs passing our coverage cutoffs.
In this plot, there is one line per individual, and the x-axis gives the scaffolds, ordered by PC1.
scaf_ord <- order(scaf_pcs$vectors[,1])
matplot(rel_scaf_coverage[scaf_ord,], type='l',
xlab='scaffold, ordered by PC1',
ylab='coverage relative to individual mean' )
Which individuals are showing different patterns? We measure this by taking the correlation of an individual's coverage pattern with the scaffold PC1:
indivs$scafPC <- as.vector(cor(scaf_pcs$vectors[,1], rel_scaf_coverage))
scaf_cor_cutoff <- -0.7
# scaf_cor_colors <- rgb(colorRamp(brewer.pal(name='RdYlBu',n=3),alpha=1.0)((1+indivs$scafPC)/2)/255)
scaf_cor_colors <- adjustcolor(ifelse(indivs$scafPC>scaf_cor_cutoff,
ifelse(indivs$scafPC>0,'blue','orange'), 'red'),0.5)
scaf_cor_cex <- pmax(0.5,3*abs(indivs$scafPC))
layout(t(1:2))
plot(indivs$scafPC, xlab='individual', ylab='correlation',
col=scaf_cor_colors, cex=scaf_cor_cex, pch=20,
main='correlation of coverage with scaffold PC1')
abline(h=scaf_cor_cutoff)
pshade()
points(tort.coords, pch=20,
cex=scaf_cor_cex,
col=scaf_cor_colors)
pshade(xlim=.ivanpah.bbox["Easting",], ylim=.ivanpah.bbox["Northing",])
points(tort.coords, pch=20,
cex=scaf_cor_cex,
col=scaf_cor_colors)
Let's look at some of these wierd scaffolds. Here is coverage on the top few, normalized by mean per-individual coverage on each scaffold, plotted in two different ways: Note that problematic high-coverage regions have a lot of SNPs in a small window. (Also: marks on the bottom denote SNPs that we throw out due to wierd coverage.)
scaffolds[head(order(scaf_pcs$vectors[,1], decreasing=TRUE)),]
## name start end length nsnps mean_coverage
## 997 scaffold_1086 69 16840 16771 287 261.1777
## 104 scaffold_115 37 16919 16882 192 255.0156
## 348 scaffold_376 5 21475 21470 420 241.2643
## 561 scaffold_607 18 22960 22942 264 247.1553
## 992 scaffold_1080 81 47198 47117 538 239.7751
## 578 scaffold_626 4 10202 10198 118 216.4746
scaf_to_coverage <- function (x) {
As <- which(1:ncol(x) %% 4 == 1)
out <- x[,As]
for (k in 1:3) { out <- out + x[,As+k] }
return(out)
}
get_coverage <- function (scafname) {
scaf_to_coverage(as.matrix(read.table(pipe(sprintf("../count-utils/get-scaffold.R %s %s", bincountfile, scafname)))))
}
selfname <- function (x) { names(x) <- x; x }
scafs <- lapply(selfname(as.character(scaffolds$name[head(order(scaf_pcs$vectors[,1], decreasing=TRUE))])), get_coverage)
mid_scafs <- lapply(selfname(as.character(scaffolds$name[order(scaf_pcs$vectors[,1], decreasing=TRUE)[503:504]])), get_coverage)
show_scaf <- function (scafname, this_coverage) {
matplot(subset(pos,chr==scafname)$pos, apply(this_coverage, 2, convolve, y=c(rep(1,20)/20,rep(0,nrow(this_coverage)-20))),
xlab='position (bp)', ylab='individual',
main='running mean of coverage by individual for 20 SNPs', type='l')
rug(subset(pos,chr==scafname & (totDepth<min_coverage | totDepth > max_coverage))$pos)
rel_coverage <- sweep(this_coverage, 2, colMeans(this_coverage), "/")
# rel_coverage <- sweep(rel_coverage, 1, rowMeans(rel_coverage), "/")
image(subset(pos,chr==scafname)$pos, z=rel_coverage, xlab='position (bp)', ylab='individual', yaxt='n', main=paste("relative coverage", scafname))
# plot(row(rel_coverage), col(rel_coverage), cex=rel_coverage/colMeans(rel_coverage)/10, col=adjustcolor('black',0.5), xlab='SNP number', ylab='individual', yaxt='n')
matplot(subset(pos,chr==scafname)$pos, apply(rel_coverage, 2, convolve, y=c(rep(1,20)/20,rep(0,nrow(rel_coverage)-20))),
xlab='position (bp)', ylab='individual',
main='running mean of relative coverage by individual for 20 SNPs', type='l')
rug(subset(pos,chr==scafname & (totDepth<min_coverage | totDepth > max_coverage))$pos)
invisible(rel_coverage)
}
layout(t(1:3))
for (k in seq_along(scafs)) { show_scaf(names(scafs)[[k]], scafs[[k]]) }
Here's some scaffolds from the middle of the pack, for comparison:
layout(t(1:3))
for (k in seq_along(mid_scafs)) { show_scaf(names(mid_scafs)[[k]], mid_scafs[[k]]) }